/* -*- C -*- */
/*
 * Copyright (c) 2007 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#include "fftw-spu.h"
#include "../fftw-cell.h"

/* Maximum edge length, in elements, of one tile handled per DMA round. */
#define BLOCKSZ 64

/*
 * Walk the upper triangle (including the diagonal) of the n x n array at
 * t->A in tiles of at most BLOCKSZ x BLOCKSZ elements, and exchange every
 * tile with its mirror tile on the other side of the diagonal.
 *
 * Tiles are numbered in visiting order; this SPE only processes the tiles
 * whose number is congruent to t->my_id modulo t->nspe and skips the rest.
 *
 * The index that advances in the outer loop is scaled by t->s1_bytes and
 * the inner-loop index by t->s0_bytes when forming addresses.  Each tile is
 * fetched with one (count, stride) pairing and written back with the
 * strides exchanged -- presumably this is what makes spu_dma2d perform the
 * actual element transposition; confirm against its definition.
 */
void X(spu_do_transpose)(const struct transpose_context *t)
{
     const int n = t->n;
     const int nspe = t->nspe;
     const int my_id = t->my_id;
     const int s0_bytes = t->s0_bytes;
     const int s1_bytes = t->s1_bytes;
     int bi, bj;           /* origin of the current tile along s1 / s0 */
     int len_i, len_j;     /* extent of the current tile along s1 / s0 */
     int off_ij, off_ji;   /* byte offsets of the tile and of its mirror */
     int counter = 0;      /* running number of the tile being visited */
     int mine;
     R *buf_a, *buf_b;     /* raw local buffers */
     R *tile_a, *tile_b;   /* the same buffers after ALIGN_LIKE adjustment */

     /*
      * Two local buffers, each sized for BLOCKSZ * BLOCKSZ * 2 values of
      * type R plus ALIGNMENT spare bytes (the spare bytes presumably leave
      * room for ALIGN_LIKE to shift the start of the buffer).
      */
     X(spu_alloc_reset)();
     buf_a = X(spu_alloc)(BLOCKSZ * BLOCKSZ * 2 * sizeof(R) + ALIGNMENT);
     buf_b = X(spu_alloc)(BLOCKSZ * BLOCKSZ * 2 * sizeof(R) + ALIGNMENT);

     for (bi = 0; bi < n; bi += len_i) {
          /* the last tile in this direction may be shorter than BLOCKSZ */
          len_i = n - bi;
          if (len_i > BLOCKSZ)
               len_i = BLOCKSZ;

          /* start at bi: only tiles on or past the diagonal are visited */
          for (bj = bi; bj < n; bj += len_j) {
               len_j = n - bj;
               if (len_j > BLOCKSZ)
                    len_j = BLOCKSZ;

               /* every tile is counted, whether or not we process it */
               mine = (counter % nspe) == my_id;
               ++counter;
               if (!mine)
                    continue;

               off_ij = bi * s1_bytes + bj * s0_bytes;
               tile_a = ALIGN_LIKE(buf_a, t->A + off_ij);

               if (bi == bj) {
                    /* diagonal tile: read it, then write it back over
                       itself with the two strides exchanged */
                    X(spu_dma2d)(tile_a, t->A + off_ij,
                                 len_j, s0_bytes, len_i, s1_bytes,
                                 MFC_GET_CMD);
                    X(spu_dma2d)(tile_a, t->A + off_ij,
                                 len_i, s1_bytes, len_j, s0_bytes,
                                 MFC_PUT_CMD);
                    continue;
               }

               /* off-diagonal tile: fetch it and its mirror first, so
                  that neither is overwritten before it has been read */
               off_ji = bj * s1_bytes + bi * s0_bytes;
               tile_b = ALIGN_LIKE(buf_b, t->A + off_ji);

               X(spu_dma2d)(tile_a, t->A + off_ij,
                            len_j, s0_bytes, len_i, s1_bytes,
                            MFC_GET_CMD);
               X(spu_dma2d)(tile_b, t->A + off_ji,
                            len_i, s0_bytes, len_j, s1_bytes,
                            MFC_GET_CMD);

               /* each buffer is stored at the other tile's location */
               X(spu_dma2d)(tile_a, t->A + off_ji,
                            len_j, s1_bytes, len_i, s0_bytes,
                            MFC_PUT_CMD);
               X(spu_dma2d)(tile_b, t->A + off_ij,
                            len_i, s1_bytes, len_j, s0_bytes,
                            MFC_PUT_CMD);
          }
     }
}
